We explore using random forests to identify which genes are most important in differentiating between eye tissue and other tissues.
# Variable to determine whether to use:
# FALSE: previously computed datasets
# TRUE: compute new datasets
freshRun = FALSE
We load in the necessary packages and data
library(dplyr)
library(ggplot2)
library(randomForest)
library(caret)
library(Rtsne)
load("~/Documents/git/unified_gene_expression/data/counts_processed.Rdata")
load("~/Documents/git/unified_gene_expression/data/core_metaData.Rdata")
expr = t(counts)
meta = core_tight
dim(expr)
## [1] 841 20330
dim(meta)
## [1] 9317 9
We left_join the meta sheet (for the sample_accession variable) and the expression matrix. We then subset the resulting matrix into RPE, Retina, and RPE+Retina tables.
if(freshRun){
sample_accession = rownames(expr)
expr.temp = as.data.frame(cbind(sample_accession, expr))
rownames(expr.temp)[1:5]
colnames(expr.temp)[1:5]
expr.meta = left_join(expr.temp, meta, by = "sample_accession") %>% dplyr::select(-run_accession) %>% distinct()
expr.temp$sample_accession = as.character(expr.temp$sample_accession)
expr.temp$sample_accession[1:5]
expr.meta$sample_accession[1:5]
identical(expr.temp$sample_accession, expr.meta$sample_accession)
rownames(expr.meta) = expr.meta$sample_accession
expr.meta[1:5, 1:3]
class(expr.meta)
table(expr.meta$Tissue)
expr.meta <- filter(expr.meta, Origin == "Tissue")
class(expr.meta)
table(expr.meta$Tissue)
expr.retina = filter(expr.meta, Tissue == "Retina")
expr.rpe = filter(expr.meta, Tissue == "RPE")
expr.retina.rpe = filter(expr.meta, Tissue == "Retina" | Tissue == "RPE")
expr.meta <- filter(expr.meta, Tissue != "Brain" & Tissue != "Cornea" & Tissue != "Retina" & Tissue != "RPE")
set.seed(47)
expr.comp.retina <- expr.meta %>% group_by(Tissue) %>% sample_n(size = 4) %>% ungroup() %>% as.data.frame()
set.seed(47)
expr.comp.rpe <- expr.meta %>% group_by(Tissue) %>% sample_n(size = 2) %>% ungroup() %>% as.data.frame()
set.seed(47)
expr.comp.retina.rpe <- expr.meta %>% group_by(Tissue) %>% sample_n(size = 6) %>% ungroup() %>% as.data.frame()
# Eye tissues
rownames(expr.retina) = expr.retina$sample_accession
rownames(expr.rpe) = expr.rpe$sample_accession
rownames(expr.retina.rpe) = expr.retina.rpe$sample_accession
expr.retina = expr.retina[, c(2:ncol(expr.retina), 1)]
expr.rpe = expr.rpe[, c(2:ncol(expr.rpe), 1)]
expr.retina.rpe = expr.retina.rpe[, c(2:ncol(expr.retina.rpe), 1)]
###
# Other tissues
rownames(expr.comp.retina) = expr.comp.retina$sample_accession
rownames(expr.comp.rpe) = expr.comp.rpe$sample_accession
rownames(expr.comp.retina.rpe) = expr.comp.retina.rpe$sample_accession
expr.comp.retina = expr.comp.retina[, c(2:ncol(expr.comp.retina), 1)]
expr.comp.rpe = expr.comp.rpe[, c(2:ncol(expr.comp.rpe), 1)]
expr.comp.retina.rpe = expr.comp.retina.rpe[, c(2:ncol(expr.comp.retina.rpe), 1)]
###
dim(expr.retina)
dim(expr.comp.retina)
dim(expr.rpe)
dim(expr.comp.rpe)
dim(expr.retina.rpe)
dim(expr.comp.retina.rpe)
save(expr.retina, file = "~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_retina.Rdata")
save(expr.rpe, file = "~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_rpe.Rdata")
save(expr.retina.rpe, file = "~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_retina_rpe.Rdata")
save(expr.comp.retina, file = "~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_composite_retina.Rdata")
save(expr.comp.rpe, file = "~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_composite_rpe.Rdata")
save(expr.comp.retina.rpe, file = "~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_composite_retina_rpe.Rdata")
}else{
load("~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_retina.Rdata")
load("~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_rpe.Rdata")
load("~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_retina_rpe.Rdata")
load("~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_composite_retina.Rdata")
load("~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_composite_rpe.Rdata")
load("~/Documents/git/unified_gene_expression/data/RF_comparisons/counts_composite_retina_rpe.Rdata")
}
We now build the random forest for the retina samples. We find that with \(nTree = 1000\), the random forest perfectly classifies the samples. We find the 50 most important genes based on MeanDecreaseAccuracy and MeanDecreaseGini, and we find that 41 of the genes in these groups are the same.
#####################################
## Retina RF ##
#####################################
meta.retina.rf = expr.retina[, which(colnames(expr.retina) == "study_accession"):ncol(expr.retina)]
expr.retina.rf = expr.retina[, 1:(which(colnames(expr.retina) == "study_accession") - 1)]
meta.comp.rf = expr.comp.retina[, which(colnames(expr.comp.retina) == "study_accession"):ncol(expr.comp.retina)]
expr.comp.rf = expr.comp.retina[, 1:(which(colnames(expr.comp.retina) == "study_accession") - 1)]
expr.retina.rf[] = sapply(expr.retina.rf, as.numeric)
expr.comp.rf[] = sapply(expr.comp.rf, as.numeric)
meta.comp.rf$Tissue = rep("Composite", length(meta.comp.rf$Tissue))
# Combining data
meta.rf = bind_rows(meta.retina.rf, meta.comp.rf)
expr.rf = bind_rows(expr.retina.rf, expr.comp.rf)
# Random forest
set.seed(47)
rf.retina.comp <- randomForest(x = expr.rf, y = factor(meta.rf$Tissue), ntree = 1000, importance = TRUE, scale = FALSE)
rf.retina.comp
##
## Call:
## randomForest(x = expr.rf, y = factor(meta.rf$Tissue), ntree = 1000, importance = TRUE, scale = FALSE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 142
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## Composite Retina class.error
## Composite 84 0 0
## Retina 0 87 0
varImpPlot(rf.retina.comp, scale = TRUE, n.var = 50, cex = 0.45, pt.cex = 0.75, main = "Top 50 Most Important Genes")
impt.retina = importance(rf.retina.comp, scale = TRUE)
sum(impt.retina[, "MeanDecreaseAccuracy"] >= qnorm(0.95, mean = 0, sd = 1))
## [1] 223
genes.acc = impt.retina[order(impt.retina[, "MeanDecreaseAccuracy"], decreasing = TRUE), ][1:50, ]
genes.gini = impt.retina[order(impt.retina[, "MeanDecreaseGini"], decreasing = TRUE), ][1:50, ]
length(intersect(rownames(genes.acc), rownames(genes.gini)))
## [1] 41
sig.genes.list.retina = rownames(impt.retina)[which(impt.retina[, "MeanDecreaseAccuracy"] >= qnorm(0.95, mean = 0, sd = 1))]
write.table(sig.genes.list.retina, file = "~/Documents/git/unified_gene_expression/results/RF_MDA_sig_retina.txt",
sep = "\n", row.names = F, col.names = F)
We now build the random forest for the RPE samples. We find that with \(nTree = 1000\), the random forest only misclassified 3 samples. We find the 50 most important genes based on MeanDecreaseAccuracy and MeanDecreaseGini, and we find that all 50 of the genes in these groups are the same.
#####################################
## RPE RF ##
#####################################
meta.rpe.rf = expr.rpe[, which(colnames(expr.rpe) == "study_accession"):ncol(expr.rpe)]
expr.rpe.rf = expr.rpe[, 1:(which(colnames(expr.rpe) == "study_accession") - 1)]
meta.comp.rf = expr.comp.rpe[, which(colnames(expr.comp.rpe) == "study_accession"):ncol(expr.comp.rpe)]
expr.comp.rf = expr.comp.rpe[, 1:(which(colnames(expr.comp.rpe) == "study_accession") - 1)]
expr.rpe.rf[] = sapply(expr.rpe.rf, as.numeric)
expr.comp.rf[] = sapply(expr.comp.rf, as.numeric)
meta.comp.rf$Tissue = rep("Composite", length(meta.comp.rf$Tissue))
# Combining data
meta.rf = bind_rows(meta.rpe.rf, meta.comp.rf)
expr.rf = bind_rows(expr.rpe.rf, expr.comp.rf)
# Random forest
set.seed(47)
rf.rpe.comp <- randomForest(x = expr.rf, y = factor(meta.rf$Tissue), ntree = 1000, importance = TRUE)
rf.rpe.comp
##
## Call:
## randomForest(x = expr.rf, y = factor(meta.rf$Tissue), ntree = 1000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 142
##
## OOB estimate of error rate: 4.05%
## Confusion matrix:
## Composite RPE class.error
## Composite 42 0 0.00000
## RPE 3 29 0.09375
varImpPlot(rf.rpe.comp, scale = TRUE, n.var = 50, cex = 0.45, pt.cex = 0.75, main = "Top 50 Most Important Genes")
impt.rpe = importance(rf.rpe.comp, scale = TRUE)
sum(impt.rpe[, "MeanDecreaseAccuracy"] >= qnorm(0.95, mean = 0, sd = 1))
## [1] 116
genes.acc = impt.rpe[order(impt.rpe[, "MeanDecreaseAccuracy"], decreasing = TRUE), ][1:100, ]
genes.gini = impt.rpe[order(impt.rpe[, "MeanDecreaseGini"], decreasing = TRUE), ][1:50, ]
length(intersect(rownames(genes.acc), rownames(genes.gini)))
## [1] 50
sig.genes.list.rpe = rownames(impt.rpe)[which(impt.rpe[, "MeanDecreaseAccuracy"] >= qnorm(0.95, mean = 0, sd = 1))]
We now build the random forest for Retina and RPE together. We find that with \(nTree = 1000\), the random forest only misclassified 3 samples. We find the 50 most important genes based on MeanDecreaseAccuracy and MeanDecreaseGini, and we find that 41 of the genes in these groups are the same.
#####################################
## Retina+RPE RF ##
#####################################
meta.retina.rpe.rf = expr.retina.rpe[, which(colnames(expr.retina.rpe) == "study_accession"):ncol(expr.retina.rpe)]
expr.retina.rpe.rf = expr.retina.rpe[, 1:(which(colnames(expr.retina.rpe) == "study_accession") - 1)]
meta.comp.rf = expr.comp.retina.rpe[, which(colnames(expr.comp.retina.rpe) == "study_accession"):ncol(expr.comp.retina.rpe)]
expr.comp.rf = expr.comp.retina.rpe[, 1:(which(colnames(expr.comp.retina.rpe) == "study_accession") - 1)]
expr.retina.rpe.rf[] = sapply(expr.retina.rpe.rf, as.numeric)
expr.comp.rf[] = sapply(expr.comp.rf, as.numeric)
meta.comp.rf$Tissue = rep("Composite", length(meta.comp.rf$Tissue))
meta.retina.rpe.rf$Tissue = rep("Retina.RPE", length(meta.retina.rpe.rf$Tissue))
# Combining data
meta.rf = bind_rows(meta.retina.rpe.rf, meta.comp.rf)
expr.rf = bind_rows(expr.retina.rpe.rf, expr.comp.rf)
# Random forest
set.seed(47)
rf.retina.rpe.comp <- randomForest(x = expr.rf, y = factor(meta.rf$Tissue), ntree = 1000, importance = TRUE)
rf.retina.rpe.comp
##
## Call:
## randomForest(x = expr.rf, y = factor(meta.rf$Tissue), ntree = 1000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 142
##
## OOB estimate of error rate: 1.22%
## Confusion matrix:
## Composite Retina.RPE class.error
## Composite 126 0 0.00000000
## Retina.RPE 3 116 0.02521008
varImpPlot(rf.retina.rpe.comp, scale = TRUE, n.var = 50, cex = 0.45, pt.cex = 0.75, main = "Top 50 Most Important Genes")
impt.retina.rpe = importance(rf.retina.rpe.comp, scale = TRUE)
sum(impt.retina.rpe[, "MeanDecreaseAccuracy"] >= qnorm(0.95, mean = 0, sd = 1))
## [1] 291
genes.acc = impt.retina.rpe[order(impt.retina.rpe[, "MeanDecreaseAccuracy"], decreasing = TRUE), ][1:50, ]
genes.gini = impt.retina.rpe[order(impt.retina.rpe[, "MeanDecreaseGini"], decreasing = TRUE), ][1:50, ]
length(intersect(rownames(genes.acc), rownames(genes.gini)))
## [1] 41
sig.genes.list.retina.rpe = rownames(impt.retina.rpe)[which(impt.retina.rpe[, "MeanDecreaseAccuracy"] >= qnorm(0.95, mean = 0, sd = 1))]
In the Leo Breiman implementation of random forests (the randomForest package we’re using), they propose using the mean decrease in accuracy as a sort of permutation test statistic that you can map to a standard normal distribution. As such, for validating the genes that appear “most important” in the random forests, let’s consider the genes for which their mean decrease in accuracy is statistically significant by permutation test.
load("~/Documents/git/unified_gene_expression/data/lengthScaledTPM_processed.Rdata")
load("~/Documents/git/unified_gene_expression/data/core_metaData.Rdata")
core_info = core_tight
lengthScaledTPM <- lengthScaledTPM_qsmooth_highExp_remove_lowGenes
lengthScaledTPM <- lengthScaledTPM[,!(is.na(lengthScaledTPM[1,]))]
perplexities = seq(10, 50, by = 10)
############################
# Retina
############################
for(p in perplexities){
# All genes
set.seed(47)
tsne_out <- Rtsne(as.matrix(log2(t(lengthScaledTPM)+1)),perplexity = p, check_duplicates = FALSE, theta=0.0 )
tsne_plot <- data.frame(tsne_out$Y)
tsne_plot$sample_accession <- colnames(lengthScaledTPM)
p1 = tsne_plot %>% left_join(.,core_info) %>%
ggplot(.,aes(x=X1,y=X2,colour=Tissue,shape=Tissue)) +
geom_point(size=4) + scale_shape_manual(values=c(0:20,35:50)) +
ggtitle(paste0("t-sne. Perplexity = ", p)) +
coord_cartesian(xlim = c(-50, 50), ylim = c(-50, 50))
print(p1)
# RF genes removed
sig.genes.indices = which(rownames(lengthScaledTPM) %in% sig.genes.list.retina)
lengthScaledTPM.sig.genes = lengthScaledTPM[-(sig.genes.indices), ]
set.seed(47)
tsne_out <- Rtsne(as.matrix(log2(t(lengthScaledTPM.sig.genes)+1)),perplexity = p, check_duplicates = FALSE, theta=0.0 )
tsne_plot <- data.frame(tsne_out$Y)
tsne_plot$sample_accession <- colnames(lengthScaledTPM.sig.genes)
p2 = tsne_plot %>% left_join(.,core_info) %>%
ggplot(.,aes(x=X1,y=X2,colour=Tissue,shape=Tissue)) +
geom_point(size=4) + scale_shape_manual(values=c(0:20,35:50)) +
ggtitle(paste0("t-sne. Perplexity = ", p)) +
coord_cartesian(xlim = c(-50, 50), ylim = c(-50, 50))
print(p2)
}
############################
# RPE
############################
for(p in perplexities){
# All genes
set.seed(47)
tsne_out <- Rtsne(as.matrix(log2(t(lengthScaledTPM)+1)),perplexity = p, check_duplicates = FALSE, theta=0.0 )
tsne_plot <- data.frame(tsne_out$Y)
tsne_plot$sample_accession <- colnames(lengthScaledTPM)
p1 = tsne_plot %>% left_join(.,core_info) %>%
ggplot(.,aes(x=X1,y=X2,colour=Tissue,shape=Tissue)) +
geom_point(size=4) + scale_shape_manual(values=c(0:20,35:50)) +
ggtitle(paste0("t-sne. Perplexity = ", p)) +
coord_cartesian(xlim = c(-50, 50), ylim = c(-50, 50))
print(p1)
# RF genes removed
sig.genes.indices = which(rownames(lengthScaledTPM) %in% sig.genes.list.rpe)
lengthScaledTPM.sig.genes = lengthScaledTPM[-(sig.genes.indices), ]
set.seed(47)
tsne_out <- Rtsne(as.matrix(log2(t(lengthScaledTPM.sig.genes)+1)),perplexity = p, check_duplicates = FALSE, theta=0.0 )
tsne_plot <- data.frame(tsne_out$Y)
tsne_plot$sample_accession <- colnames(lengthScaledTPM.sig.genes)
p2 = tsne_plot %>% left_join(.,core_info) %>%
ggplot(.,aes(x=X1,y=X2,colour=Tissue,shape=Tissue)) +
geom_point(size=4) + scale_shape_manual(values=c(0:20,35:50)) +
ggtitle(paste0("t-sne. Perplexity = ", p)) +
coord_cartesian(xlim = c(-50, 50), ylim = c(-50, 50))
print(p2)
}
############################
# Retina+RPE
############################
for(p in perplexities){
# All genes
set.seed(47)
tsne_out <- Rtsne(as.matrix(log2(t(lengthScaledTPM)+1)),perplexity = p, check_duplicates = FALSE, theta=0.0 )
tsne_plot <- data.frame(tsne_out$Y)
tsne_plot$sample_accession <- colnames(lengthScaledTPM)
p1 = tsne_plot %>% left_join(.,core_info) %>%
ggplot(.,aes(x=X1,y=X2,colour=Tissue,shape=Tissue)) +
geom_point(size=4) + scale_shape_manual(values=c(0:20,35:50)) +
ggtitle(paste0("t-sne. Perplexity = ", p)) +
coord_cartesian(xlim = c(-50, 50), ylim = c(-50, 50))
print(p1)
# RF genes removed
sig.genes.indices = which(rownames(lengthScaledTPM) %in% sig.genes.list.retina.rpe)
lengthScaledTPM.sig.genes = lengthScaledTPM[-(sig.genes.indices), ]
set.seed(47)
tsne_out <- Rtsne(as.matrix(log2(t(lengthScaledTPM.sig.genes)+1)),perplexity = p, check_duplicates = FALSE, theta=0.0 )
tsne_plot <- data.frame(tsne_out$Y)
tsne_plot$sample_accession <- colnames(lengthScaledTPM.sig.genes)
p2 = tsne_plot %>% left_join(.,core_info) %>%
ggplot(.,aes(x=X1,y=X2,colour=Tissue,shape=Tissue)) +
geom_point(size=4) + scale_shape_manual(values=c(0:20,35:50)) +
ggtitle(paste0("t-sne. Perplexity = ", p)) +
coord_cartesian(xlim = c(-50, 50), ylim = c(-50, 50))
print(p2)
}